Synopsis

Evaluation of the third round of scoring Miguel Vázquez did for the DSP.

Libraries

dyn.load('/Library/Java/JavaVirtualMachines/jdk1.8.0_131.jdk/Contents/Home/jre/lib/server/libjvm.dylib')
library(rJava)
require(data.table)
require(ggplot2)
require(xlsx)
require(gridExtra)
require(splitstackshape)

Upload data

# Round 1 scores: tab-separated text file; one line is skipped before the
# header row. Duplicate rows are dropped.
dsp_scored_rd1 <- unique(
  fread("../round1/DarkSpace-v1.txt", sep = "\t", header = TRUE, skip = 1)
)

# Round 2 scores: first sheet of the Excel workbook, read with explicit
# column classes and converted to a data.table before de-duplication.
dsp_scored_rd2 <- unique(
  data.table(
    read.xlsx2(
      "../round2/DarkSpace_rank2.xls",
      sheetIndex = 1,
      header = TRUE,
      colClasses = c("character", "numeric", "character", "integer",
                     "character", "numeric", "numeric")
    )
  )
)

# Round 3 scores (the evaluation under review), de-duplicated.
dsp_scored_rd3 <- unique(fread("./pmid_ranks.txt", header = TRUE))

Preformat latest evaluation

# Normalise the round 3 column names in a single assignment: dashes and
# spaces become underscores, then '#', '(' and ')' are removed; the result
# is lower-cased.
colnames(dsp_scored_rd3) <- tolower(
  gsub(
    "#|\\(|\\)",
    "",
    tolower(gsub("-| ", "_", colnames(dsp_scored_rd3)))
  )
)

# Flag each publication as "true"/"false" depending on whether its
# 'known_pairs' field is non-empty.
dsp_scored_rd3 <- dsp_scored_rd3[
  ,
  known := ifelse(known_pairs == "", "false", "true")
]

# Tabulate the flag, showing NAs if there are any.
table(dsp_scored_rd3$known, useNA = "ifany")
## 
## false  true 
## 80446 33869

Explore data

I check the distribution of the relevance and the combined score separating IMEx positive and negative publications and comparing the previous iterations with the current one.

Relevance score plot

# Histograms of the relevance score for each scoring round, stacked in one
# column. Rounds 1 and 2 are filled by the IMEX column, round 3 by the
# derived 'known' flag. An x-axis limit of xlim(0.0, 2.0) was tried for
# rounds 1 and 2 and is left disabled.

# Round 1
g1 <- ggplot(dsp_scored_rd1, aes(x = Relevance, fill = IMEX)) +
  geom_histogram(alpha = 0.8, position = "identity") +
  xlab("relevance score") +
  ylab("Number of publications") +
  ggtitle("Relevance score distribution, round 1") +
  theme(
    plot.title = element_text(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank()
  )

# Round 2
g2 <- ggplot(dsp_scored_rd2, aes(x = Relevance, fill = IMEX)) +
  geom_histogram(alpha = 0.8, position = "identity") +
  xlab("relevance score") +
  ylab("Number of publications") +
  ggtitle("Relevance score distribution, round 2") +
  theme(
    plot.title = element_text(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank()
  )

# Round 3: x-axis breaks are placed at every integer from 0 to 24.
g3 <- ggplot(dsp_scored_rd3, aes(x = relevance, fill = known)) +
  geom_histogram(alpha = 0.8, position = "identity") +
  xlab("relevance score") +
  ylab("Number of publications") +
  ggtitle("Relevance score distribution, round 3") +
  scale_x_continuous(breaks = 0:24) +
  theme(
    plot.title = element_text(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank()
  )

grid.arrange(g1, g2, g3, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Density plots of the relevance score for each scoring round, stacked in
# one column. Rounds 1 and 2 are filled by the IMEX column, round 3 by the
# derived 'known' flag.
#
# The y axis of geom_density() is a density estimate, not a publication
# count, so it is labelled "Density" here.
# An x-axis limit of xlim(0.0, 2.0) was tried for rounds 1 and 2 and is left
# disabled.

# Round 1 (y axis restricted to 0-5)
g1.1 <- ggplot(dsp_scored_rd1, aes(x = Relevance, fill = IMEX)) +
  geom_density(alpha = 0.8, position = "identity") +
  xlab("relevance score") +
  ylab("Density") +
  ggtitle("Relevance score distribution, round 1") +
  ylim(0, 5) +
  theme(
    plot.title = element_text(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank()
  )

# Round 2 (y axis restricted to 0-5)
g2.1 <- ggplot(dsp_scored_rd2, aes(x = Relevance, fill = IMEX)) +
  geom_density(alpha = 0.8, position = "identity") +
  xlab("relevance score") +
  ylab("Density") +
  ggtitle("Relevance score distribution, round 2") +
  ylim(0, 5) +
  theme(
    plot.title = element_text(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank()
  )

# Round 3: x-axis breaks at every integer from 0 to 24, no y limit.
g3.1 <- ggplot(dsp_scored_rd3, aes(x = relevance, fill = known)) +
  geom_density(alpha = 0.8, position = "identity") +
  xlab("relevance score") +
  ylab("Density") +
  ggtitle("Relevance score distribution, round 3") +
  scale_x_continuous(breaks = 0:24) +
  theme(
    plot.title = element_text(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank()
  )

grid.arrange(g1.1, g2.1, g3.1, ncol = 1)

The re-calculated score is certainly distributed differently from previous instances. It is also a more complex approach, so I need to explore other angles.

# Round 3: relevance score against 'dark_space_interest', coloured by the
# 'known' flag, shown three ways and stacked in one column.

# Raw points; the y axis is limited to 0-100.
g3.2 <- ggplot(
  dsp_scored_rd3,
  aes(x = relevance, y = dark_space_interest, colour = known)
) +
  geom_point(alpha = 0.5) +
  xlab("relevance score") +
  ylab("dsp interest (truncated top at 100)") +
  ggtitle("Relevance score vs Dark Space interest") +
  ylim(0.0, 100) +
  theme(
    plot.title = element_text(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank()
  )

# Smoother with ggplot2's default method (a ylim(0.0, 7) was tried and is
# left disabled).
g3.3 <- ggplot(
  dsp_scored_rd3,
  aes(x = relevance, y = dark_space_interest, colour = known)
) +
  geom_smooth(alpha = 0.5) +
  xlab("relevance score") +
  ylab("dsp interest") +
  ggtitle("Relevance score vs Dark Space interest, smooth plot") +
  theme(
    plot.title = element_text(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank()
  )

# Linear-model fit, y ~ x (a ylim(0.0, 7) was tried and is left disabled).
g3.4 <- ggplot(
  dsp_scored_rd3,
  aes(x = relevance, y = dark_space_interest, colour = known)
) +
  geom_smooth(method = "lm", formula = y ~ x, alpha = 0.5) +
  xlab("relevance score") +
  ylab("dsp interest") +
  ggtitle("Relevance score vs Dark Space interest, LM fitted") +
  theme(
    plot.title = element_text(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank()
  )

grid.arrange(g3.2, g3.3, g3.4, ncol = 1)
## Warning: Removed 5 rows containing missing values (geom_point).
## `geom_smooth()` using method = 'gam'

# Round 3: relevance score against 'dark_space_partial_interest', coloured
# by the 'known' flag, shown three ways and stacked in one column.
#
# The titles name the *partial* interest score so these panels cannot be
# confused with the otherwise identical 'dark_space_interest' plots.

# Raw points; the y axis is limited to 0-100.
g3.5 <- ggplot(
  dsp_scored_rd3,
  aes(x = relevance, y = dark_space_partial_interest, colour = known)
) +
  geom_point(alpha = 0.5) +
  xlab("relevance score") +
  ylab("dsp partial interest (truncated top at 100)") +
  ggtitle("Relevance score vs Dark Space partial interest") +
  ylim(0.0, 100) +
  theme(
    plot.title = element_text(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank()
  )

# Smoother with ggplot2's default method (a ylim(0.0, 7) was tried and is
# left disabled).
g3.6 <- ggplot(
  dsp_scored_rd3,
  aes(x = relevance, y = dark_space_partial_interest, colour = known)
) +
  geom_smooth(alpha = 0.5) +
  xlab("relevance score") +
  ylab("dsp partial interest") +
  ggtitle("Relevance score vs Dark Space partial interest, smooth plot") +
  theme(
    plot.title = element_text(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank()
  )

# Linear-model fit, y ~ x (a ylim(0.0, 7) was tried and is left disabled).
g3.7 <- ggplot(
  dsp_scored_rd3,
  aes(x = relevance, y = dark_space_partial_interest, colour = known)
) +
  geom_smooth(method = "lm", formula = y ~ x, alpha = 0.5) +
  xlab("relevance score") +
  ylab("dsp partial interest") +
  ggtitle("Relevance score vs Dark Space partial interest, LM fitted") +
  theme(
    plot.title = element_text(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank()
  )

grid.arrange(g3.5, g3.6, g3.7, ncol = 1)
## Warning: Removed 103 rows containing missing values (geom_point).
## `geom_smooth()` using method = 'gam'

It seems the interest scores plot against the relevance score as expected, having highly interesting proteins in the ‘darkest’ areas of the dataset.

Checking distribution of scores by data source

Obtain DSP comparison table for reference

For this comparison I will use only the table that compares the datasets at the publication level.

# Load the publication-level DSP comparison table, unless an object named
# 'dsp_pubcomp' already exists in the session.
#
# The gzipped table is read through shell pipelines relative to the results
# directory: the first line supplies the column names, and the body is every
# line that does not start with "Day" (presumably the header line - TODO
# confirm).
#
# setwd() returns the previous working directory. It is saved and restored,
# even if reading fails, so that later relative paths (e.g. "./...") keep
# resolving against the directory the analysis was started from instead of a
# hard-coded folder.
if (!exists("dsp_pubcomp")) {
  previous_wd <- setwd("~/Documents/Projects/dsp/darkspaceproject/dsp_comparison/results/")
  tryCatch(
    {
      dsp_pubcompNames <- fread('cat pubcomp_table_final.txt.gz | gunzip | head -n 1')[, colnames(.SD)]
      dsp_pubcomp <- fread('cat pubcomp_table_final.txt.gz | gunzip | grep -v "^Day"')
      setnames(dsp_pubcomp, dsp_pubcompNames)
    },
    finally = setwd(previous_wd)
  )
}
Incorporating & formatting dataset info to ranked lists
# Attach the per-source membership columns of the comparison table to the
# round 3 scores. The join key is coerced to character first; every scored
# publication is kept (left join) and duplicate rows are dropped.
dsp_scored_rd3$pmid <- as.character(dsp_scored_rd3[["pmid"]])
dsp_scored_rd3_ori <- unique(
  merge(
    dsp_scored_rd3,
    dsp_pubcomp,
    by = "pmid",
    all.x = TRUE,
    all.y = FALSE
  )
)

# Keep the score columns and rewrite each source column so that "1" is
# replaced by the source's label (other values pass through gsub unchanged).
dsp_scored_rd3_ori_sel <- dsp_scored_rd3_ori[
  ,
  .(
    pmid,
    relevance,
    known,
    db_score,
    dark_space_interest,
    dark_space_partial_interest,
    imex = gsub("1", "IMEx", imex.y),
    reactome = gsub("1", "reactome", reactome.y),
    tm_epmc = gsub("1", "tm_epmc", tm_epmc),
    EVEX = gsub("1", "EVEX", EVEX),
    BioGRID = gsub("1", "BioGRID", BioGRID),
    GO_IPI = gsub("1", "GO_IPI", GO_IPI),
    OmniPath_interactions = gsub("1", "OmniPath_interactions", OmniPath_interactions),
    OmniPath_ptm = gsub("1", "OmniPath_ptm", OmniPath_ptm)
  )
]

# Save the annotated table as tab-separated text, unquoted, with a header.
fwrite(
  dsp_scored_rd3_ori_sel,
  "./dsp_scored_rd3_ori_sel.txt",
  col.names = TRUE,
  row.names = FALSE,
  sep = "\t",
  quote = FALSE
)

# Reshape the per-source columns into long format: one row per publication
# and source, with the source label (or the untouched original value, e.g.
# "0") in 'origin' and reshape()'s row identifier in 'id'.
dsp_scored_rd3_ori_long <- reshape(
  dsp_scored_rd3_ori_sel,
  direction = "long",
  v.names = "origin",
  varying = c("imex", "reactome", "tm_epmc", "EVEX", "BioGRID", "GO_IPI",
              "OmniPath_interactions", "OmniPath_ptm")
)

# Sort by publication with 'origin' descending, so that within a publication
# the "0" placeholders come after any source labels; then drop duplicates.
dsp_scored_rd3_ori_long_sel <- unique(
  dsp_scored_rd3_ori_long[
    order(pmid, -origin),
    .(pmid, relevance, known, db_score, dark_space_interest,
      dark_space_partial_interest, origin, id)
  ]
)

# Mark redundant placeholder rows: a row is dropped when its origin is "0"
# and the previous row belongs to the same publication. This is the
# vectorised equivalent of comparing each row with its predecessor in a
# loop; it also works for tables with fewer than two rows, and a comparison
# that yields NA (first row, missing origin) leaves the row selected
# instead of raising an error.
dsp_scored_rd3_ori_long_sel[, select := "yes"]
dsp_scored_rd3_ori_long_sel[
  which(pmid == shift(pmid) & origin == "0"),
  select := "no"
]

dsp_scored_rd3_ori_long_final <- dsp_scored_rd3_ori_long_sel[
  select == "yes",
  .(pmid, relevance, known, db_score, dark_space_interest,
    dark_space_partial_interest, origin, id)
]

# Publications per source, split by the 'known' flag.
table(
  dsp_scored_rd3_ori_long_final$known,
  dsp_scored_rd3_ori_long_final$origin,
  useNA = "ifany"
)
##        
##         BioGRID  EVEX GO_IPI  IMEx OmniPath_interactions OmniPath_ptm
##   false     741 44947    191   385                  5145         1750
##   true    23498  9333   4841  8594                  1450          856
##        
##         reactome tm_epmc
##   false     1941   30448
##   true      1102    3596

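The table above shows the number of publications per data source, split by whether they are in the 'known' set.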

Plots origin comparison
# Round 3 relevance-score histogram, one facet row per data source with
# independent y scales, filled by the 'known' flag.
g3.8 <- ggplot(dsp_scored_rd3_ori_long_final, aes(x = relevance, fill = known)) +
  geom_histogram(alpha = 0.8, position = "identity") +
  xlab("relevance score") +
  ylab("Number of publications") +
  ggtitle("Relevance score distribution, round 3") +
  facet_grid(origin ~ ., scales = "free_y") +
  theme(
    plot.title = element_text(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank()
  )

g3.8
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Round 3 relevance-score density, one facet row per data source with
# independent y scales, filled by the 'known' flag.
# geom_density() plots a density estimate rather than a publication count,
# so the y axis is labelled "Density".
g3.9 <- ggplot(dsp_scored_rd3_ori_long_final, aes(x = relevance, fill = known)) +
  geom_density(alpha = 0.9, position = "identity") +
  xlab("relevance score") +
  ylab("Density") +
  ggtitle("Relevance score distribution, round 3") +
  facet_grid(origin ~ ., scales = "free_y") +
  theme(
    plot.title = element_text(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank()
  )

g3.9

Checking dsp interest vs relevance score

# Relevance score against 'dark_space_interest' as raw points, one facet row
# per data source, coloured by the 'known' flag; y axis limited to 0-100.
g3.10 <- ggplot(
  dsp_scored_rd3_ori_long_final,
  aes(x = relevance, y = dark_space_interest, colour = known)
) +
  geom_point(alpha = 0.5) +
  xlab("relevance score") +
  ylab("dsp interest (truncated top at 100)") +
  ggtitle("Relevance score vs Dark Space interest, per origin") +
  facet_grid(origin ~ ., scales = "free_y") +
  ylim(0.0, 100) +
  theme(
    plot.title = element_text(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank()
  )
g3.10
## Warning: Removed 7 rows containing missing values (geom_point).

# Relevance score against 'dark_space_interest' with ggplot2's default
# smoother, one facet row per data source, coloured by the 'known' flag.
g3.11 <- ggplot(
  dsp_scored_rd3_ori_long_final,
  aes(x = relevance, y = dark_space_interest, colour = known)
) +
  geom_smooth(alpha = 0.5) +
  xlab("relevance score") +
  ylab("dsp interest") +
  ggtitle("Relevance score vs Dark Space interest, smooth plot") +
  facet_grid(origin ~ ., scales = "free_y") +
  theme(
    plot.title = element_text(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank()
  )
g3.11
## `geom_smooth()` using method = 'gam'

# Relevance score against 'dark_space_interest' with a linear-model fit
# (y ~ x), one facet row per data source, coloured by the 'known' flag.
g3.12 <- ggplot(
  dsp_scored_rd3_ori_long_final,
  aes(x = relevance, y = dark_space_interest, colour = known)
) +
  geom_smooth(method = "lm", formula = y ~ x, alpha = 0.5) +
  xlab("relevance score") +
  ylab("dsp interest") +
  ggtitle("Relevance score vs Dark Space interest, LM fitted") +
  facet_grid(origin ~ ., scales = "free_y") +
  theme(
    plot.title = element_text(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank()
  )
g3.12

Comparison with round 1

After manual evaluation, it seems the relevance score has lost the power to predict whether a publication contains interactions or not. I will compare it to the first round of scoring to see where the differences lie.

Merge datasets

# Build a character 'pmid' key for round 1 from its '#ID' column.
dsp_scored_rd1$pmid <- as.character(dsp_scored_rd1[["#ID"]])

# Keep the round 1 columns needed for the comparison, renaming the ones
# that would clash with round 3 names or that contain spaces.
dsp_scored_rd1_sel <- dsp_scored_rd1[
  ,
  .(
    pmid,
    rel_rd1 = Relevance,
    IMEX,
    Coverage,
    Proteins,
    prot_interest = `Protein interest`,
    comb_score = `Combined score`
  )
]

# Full outer join of round 1 and the annotated round 3 table on 'pmid',
# with duplicate rows removed.
dsp_scored_rd1_plus_3 <- data.table(
  unique(
    merge(dsp_scored_rd1_sel, dsp_scored_rd3_ori_sel, by = "pmid", all = TRUE)
  )
)

# Cross-tabulate the round 3 'known' flag against the round 1 IMEX flag.
table(
  dsp_scored_rd1_plus_3$known,
  dsp_scored_rd1_plus_3$IMEX,
  useNA = "ifany"
)
##        
##         false  true
##   false 80061   385
##   true  25275  8594

There are 385 IMEx entries that are not in the ‘known’ set. How is that possible? Most importantly, over 25,000 known publications are not part of the IMEx dataset. This will be mostly BioGRID data.

Plot relevance rd1 vs rd3

First I will explore how the different relevance scores relate to each other.

# Round 3 relevance (x) against round 1 relevance (y), coloured by the
# 'known' flag and grouped/faceted by the round 1 IMEX flag.
# Alternatives that were tried and are left disabled: mapping IMEX to point
# shape, and a ylim(0.0, 100) limit.
g4 <- ggplot(
  dsp_scored_rd1_plus_3,
  aes(x = relevance, y = rel_rd1, colour = known, group = IMEX)
) +
  geom_point(alpha = 0.5) +
  xlab("relevance score rd3") +
  ylab("relevance score rd1") +
  ggtitle("Relevance score rd1 vs rd3") +
  facet_grid(IMEX ~ ., scales = "free_y") +
  theme(
    plot.title = element_text(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank()
  )
g4

Now I represent the same plot by source data. I need to wrangle the comparison to ‘long’ format first.

# Derive an IMEx source column from the round 1 IMEX flag, using the same
# label / "0" coding as the other source columns.
dsp_scored_rd1_plus_3 <- dsp_scored_rd1_plus_3[
  ,
  in_IMEx := ifelse(IMEX == "true", "IMEx", "0")
]

# Reshape the per-source columns into long format: one row per publication
# and source, with the source label (or "0") in 'origin'.
dsp_scored_rd1_plus_3_long <- reshape(
  dsp_scored_rd1_plus_3,
  direction = "long",
  v.names = "origin",
  varying = c("in_IMEx", "reactome", "tm_epmc", "EVEX", "BioGRID", "GO_IPI",
              "OmniPath_interactions", "OmniPath_ptm")
)

# Sort by publication with 'origin' descending, so that within a publication
# the "0" placeholders come after any source labels; then drop duplicates.
dsp_scored_rd1_plus_3_long_sel <- unique(
  dsp_scored_rd1_plus_3_long[
    order(pmid, -origin),
    .(pmid, rel_rd1, relevance, known, origin, db_score,
      dark_space_interest, dark_space_partial_interest)
  ]
)

# Mark redundant placeholder rows: a row is dropped when its origin is "0"
# and the previous row belongs to the same publication. This is the
# vectorised equivalent of comparing each row with its predecessor in a
# loop; it also works for tables with fewer than two rows, and a comparison
# that yields NA (first row, missing origin) leaves the row selected
# instead of raising an error.
dsp_scored_rd1_plus_3_long_sel[, select := "yes"]
dsp_scored_rd1_plus_3_long_sel[
  which(pmid == shift(pmid) & origin == "0"),
  select := "no"
]

dsp_scored_rd1_plus_3_long_final <- dsp_scored_rd1_plus_3_long_sel[
  select == "yes",
  .(pmid, rel_rd1, relevance, known, db_score, dark_space_interest,
    dark_space_partial_interest, origin)
]

# Publications per source, split by the 'known' flag.
table(
  dsp_scored_rd1_plus_3_long_final$known,
  dsp_scored_rd1_plus_3_long_final$origin,
  useNA = "ifany"
)
##        
##         BioGRID  EVEX GO_IPI  IMEx OmniPath_interactions OmniPath_ptm
##   false     741 44947    191   385                  5145         1750
##   true    23498  9333   4841  8594                  1450          856
##        
##         reactome tm_epmc
##   false     1941   30448
##   true      1102    3596
# Round 3 relevance (x) against round 1 relevance (y), coloured by data
# source and grouped/faceted by the 'known' flag.
# Alternatives that were tried and are left disabled: mapping IMEX to point
# shape, and a ylim(0.0, 100) limit.
g5 <- ggplot(
  dsp_scored_rd1_plus_3_long_final,
  aes(x = relevance, y = rel_rd1, colour = origin, group = known)
) +
  geom_point(alpha = 0.2) +
  xlab("relevance score rd3") +
  ylab("relevance score rd1") +
  ggtitle("Relevance score rd1 vs rd3") +
  facet_grid(known ~ .) +
  theme(
    plot.title = element_text(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank()
  )
g5